<!DOCTYPE HTML>
<html lang="zh-Hans">
    <!-- Start book Python爬虫课程讲义 -->
    <head>
        <!-- head:start -->
        <meta charset="UTF-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge" />
        <title>(案例二)阳光热线问政平台爬虫 | Python爬虫课程讲义</title>
        <meta content="text/html; charset=utf-8" http-equiv="Content-Type">
        <meta name="description" content="">
        <meta name="generator" content="GitBook 2.6.7">
        <meta name="author" content="BigCat">
        
        <meta name="HandheldFriendly" content="true"/>
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <meta name="apple-mobile-web-app-capable" content="yes">
        <meta name="apple-mobile-web-app-status-bar-style" content="black">
        <link rel="apple-touch-icon-precomposed" sizes="152x152" href="../../gitbook/images/apple-touch-icon-precomposed-152.png">
        <link rel="shortcut icon" href="../../gitbook/images/favicon.ico" type="image/x-icon">
        
    <link rel="stylesheet" href="../../gitbook/style.css">
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-tbfed-pagefooter/footer.css">
        
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-splitter/splitter.css">
        
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-toggle-chapters/toggle.css">
        
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-highlight/website.css">
        
    
        
        <link rel="stylesheet" href="../../gitbook/plugins/gitbook-plugin-fontsettings/website.css">
        
    
    

        
    
    
    <link rel="next" href="../../file/part05/5.3.html" />
    
    
    <link rel="prev" href="../../file/part05/5.1.html" />
    

        <!-- head:end -->
    </head>
    <body>
        <!-- body:start -->
        
    <div class="book"
        data-level="5.2"
        data-chapter-title="(案例二)阳光热线问政平台爬虫"
        data-filepath="file/part05/5.2.md"
        data-basepath="../.."
        data-revision="Thu Feb 09 2017 09:48:59 GMT+0800 (CST)"
        data-innerlanguage="">
    

<div class="book-summary">
    <nav role="navigation">
        <ul class="summary">
            
            
            
            

            

            
    
        <li class="chapter " data-level="0" data-path="index.html">
            
                
                    <a href="../../index.html">
                
                        <i class="fa fa-check"></i>
                        
                        传智播客Python学院爬虫课程
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1" data-path="file/part01/1.html">
            
                
                    <a href="../../file/part01/1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.</b>
                        
                        爬虫原理与数据抓取
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="1.1" data-path="file/part01/1.1.html">
            
                
                    <a href="../../file/part01/1.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.1.</b>
                        
                        (了解)通用爬虫和聚焦爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.2" data-path="file/part01/1.2.html">
            
                
                    <a href="../../file/part01/1.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.2.</b>
                        
                        (复习)HTTP/HTTPS的请求与响应
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.3" data-path="file/part01/1.3.html">
            
                
                    <a href="../../file/part01/1.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.3.</b>
                        
                        HTTP/HTTPS抓包工具-Fiddler
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.4" data-path="file/part01/1.4.html">
            
                
                    <a href="../../file/part01/1.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.4.</b>
                        
                        urllib2模块的基本使用
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.5" data-path="file/part01/1.5.html">
            
                
                    <a href="../../file/part01/1.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.5.</b>
                        
                        urllib2：GET请求和POST请求
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.6" data-path="file/part01/1.6.html">
            
                
                    <a href="../../file/part01/1.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.6.</b>
                        
                        urllib2：Handler处理器和自定义Opener
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.7" data-path="file/part01/1.7.html">
            
                
                    <a href="../../file/part01/1.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.7.</b>
                        
                        urllib2：URLError与HTTPError
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="1.8" data-path="file/part01/1.8.html">
            
                
                    <a href="../../file/part01/1.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>1.8.</b>
                        
                        Requests模块
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="2" data-path="file/part02/2.html">
            
                
                    <a href="../../file/part02/2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.</b>
                        
                        非结构化数据与结构化数据提取
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="2.1" data-path="file/part02/2.1.html">
            
                
                    <a href="../../file/part02/2.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.1.</b>
                        
                        正则表达式re模块
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.2" data-path="file/part02/2.2.html">
            
                
                    <a href="../../file/part02/2.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.2.</b>
                        
                        案例：使用正则表达式的爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.3" data-path="file/part02/2.3.html">
            
                
                    <a href="../../file/part02/2.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.3.</b>
                        
                        XPath与lxml类库
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.4" data-path="file/part02/2.4.html">
            
                
                    <a href="../../file/part02/2.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.4.</b>
                        
                        案例：使用XPath的爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.5" data-path="file/part02/2.5.html">
            
                
                    <a href="../../file/part02/2.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.5.</b>
                        
                        BeautifulSoup4 解析器
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.6" data-path="file/part02/2.6.html">
            
                
                    <a href="../../file/part02/2.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.6.</b>
                        
                        案例：使用bs4的爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.7" data-path="file/part02/2.7.html">
            
                
                    <a href="../../file/part02/2.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.7.</b>
                        
                        JSON模块与JsonPath
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.8" data-path="file/part02/2.8.html">
            
                
                    <a href="../../file/part02/2.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.8.</b>
                        
                        糗事百科案例
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="2.9" data-path="file/part02/2.9.html">
            
                
                    <a href="../../file/part02/2.9.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>2.9.</b>
                        
                        多线程爬虫案例
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="3" data-path="file/part03/3.html">
            
                
                    <a href="../../file/part03/3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.</b>
                        
                        动态HTML处理和机器图像识别
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="3.1" data-path="file/part03/3.1.html">
            
                
                    <a href="../../file/part03/3.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.1.</b>
                        
                        动态HTML介绍
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.2" data-path="file/part03/3.2.html">
            
                
                    <a href="../../file/part03/3.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.2.</b>
                        
                        Selenium与PhantomJS
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.3" data-path="file/part03/3.3.html">
            
                
                    <a href="../../file/part03/3.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.3.</b>
                        
                        案例一：网站模拟登录
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.4" data-path="file/part03/3.4.html">
            
                
                    <a href="../../file/part03/3.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.4.</b>
                        
                        案例二：动态页面模拟点击
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.5" data-path="file/part03/3.5.html">
            
                
                    <a href="../../file/part03/3.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.5.</b>
                        
                        案例三：执行JavaScript语句
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.6" data-path="file/part03/3.6.html">
            
                
                    <a href="../../file/part03/3.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.6.</b>
                        
                        机器视觉与Tesseract介绍
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.7" data-path="file/part03/3.7.html">
            
                
                    <a href="../../file/part03/3.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.7.</b>
                        
                        处理一些格式规范的文字
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.8" data-path="file/part03/3.8.html">
            
                
                    <a href="../../file/part03/3.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.8.</b>
                        
                        案例：尝试对验证码进行机器识别处理
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="3.9" data-path="file/part03/3.9.html">
            
                
                    <a href="../../file/part03/3.9.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>3.9.</b>
                        
                        机器学习：训练Tesseract
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="4" data-path="file/part04/4.html">
            
                
                    <a href="../../file/part04/4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.</b>
                        
                        Scrapy框架
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="4.1" data-path="file/part04/4.1.html">
            
                
                    <a href="../../file/part04/4.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.1.</b>
                        
                        配置安装
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.2" data-path="file/part04/4.2.html">
            
                
                    <a href="../../file/part04/4.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.2.</b>
                        
                        入门案例
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.3" data-path="file/part04/4.3.html">
            
                
                    <a href="../../file/part04/4.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.3.</b>
                        
                        Scrapy Shell
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.4" data-path="file/part04/4.4.html">
            
                
                    <a href="../../file/part04/4.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.4.</b>
                        
                        Item Pipeline
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.5" data-path="file/part04/4.5.html">
            
                
                    <a href="../../file/part04/4.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.5.</b>
                        
                        Spiders
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.6" data-path="file/part04/4.6.html">
            
                
                    <a href="../../file/part04/4.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.6.</b>
                        
                        CrawlSpiders
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.7" data-path="file/part04/4.7.html">
            
                
                    <a href="../../file/part04/4.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.7.</b>
                        
                        Request/Response
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.8" data-path="file/part04/4.8.html">
            
                
                    <a href="../../file/part04/4.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.8.</b>
                        
                        Downloader Middlewares
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="4.9" data-path="file/part04/4.9.html">
            
                
                    <a href="../../file/part04/4.9.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>4.9.</b>
                        
                        Settings
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="5" data-path="file/part05/5.html">
            
                
                    <a href="../../file/part05/5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.</b>
                        
                        Scrapy实战项目
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="5.1" data-path="file/part05/5.1.html">
            
                
                    <a href="../../file/part05/5.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.1.</b>
                        
                        (案例一)手机App抓包爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter active" data-level="5.2" data-path="file/part05/5.2.html">
            
                
                    <a href="../../file/part05/5.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.2.</b>
                        
                        (案例二)阳光热线问政平台爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.3" data-path="file/part05/5.3.html">
            
                
                    <a href="../../file/part05/5.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.3.</b>
                        
                        (案例三)新浪网分类资讯爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.4" data-path="file/part05/5.4.html">
            
                
                    <a href="../../file/part05/5.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.4.</b>
                        
                        (案例四)图片下载器爬虫
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.5" data-path="file/part05/5.5.html">
            
                
                    <a href="../../file/part05/5.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.5.</b>
                        
                        (案例五)将数据保存在MongoDB中
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.6" data-path="file/part05/5.6.html">
            
                
                    <a href="../../file/part05/5.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.6.</b>
                        
                        (案例六)三种scrapy模拟登陆策略
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="5.7" data-path="file/part05/5.7.html">
            
                
                    <a href="../../file/part05/5.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>5.7.</b>
                        
                        附：通过Fiddler进行手机抓包方法
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="6" data-path="file/part06/6.html">
            
                
                    <a href="../../file/part06/6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.</b>
                        
                        scrapy-redis分布式组件
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="6.1" data-path="file/part06/6.1.html">
            
                
                    <a href="../../file/part06/6.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.1.</b>
                        
                        源码分析参考：Connection
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.2" data-path="file/part06/6.2.html">
            
                
                    <a href="../../file/part06/6.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.2.</b>
                        
                        源码分析参考：Dupefilter
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.3" data-path="file/part06/6.3.html">
            
                
                    <a href="../../file/part06/6.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.3.</b>
                        
                        源码分析参考：Picklecompat
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.4" data-path="file/part06/6.4.html">
            
                
                    <a href="../../file/part06/6.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.4.</b>
                        
                        源码分析参考：Pipelines
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.5" data-path="file/part06/6.5.html">
            
                
                    <a href="../../file/part06/6.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.5.</b>
                        
                        源码分析参考：Queue
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.6" data-path="file/part06/6.6.html">
            
                
                    <a href="../../file/part06/6.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.6.</b>
                        
                        源码分析参考：Scheduler
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="6.7" data-path="file/part06/6.7.html">
            
                
                    <a href="../../file/part06/6.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>6.7.</b>
                        
                        源码分析参考：Spider
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="7" data-path="file/part07/7.html">
            
                
                    <a href="../../file/part07/7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.</b>
                        
                        scrapy-redis实战
                    </a>
            
            
            <ul class="articles">
                
    
        <li class="chapter " data-level="7.1" data-path="file/part07/7.1.html">
            
                
                    <a href="../../file/part07/7.1.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.1.</b>
                        
                        源码自带项目说明
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.2" data-path="file/part07/7.2.html">
            
                
                    <a href="../../file/part07/7.2.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.2.</b>
                        
                        有缘网分布式爬虫项目1
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.3" data-path="file/part07/7.3.html">
            
                
                    <a href="../../file/part07/7.3.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.3.</b>
                        
                        有缘网分布式爬虫项目2
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.4" data-path="file/part07/7.4.html">
            
                
                    <a href="../../file/part07/7.4.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.4.</b>
                        
                        处理Redis里的数据
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.5" data-path="file/part07/7.5.html">
            
                
                    <a href="../../file/part07/7.5.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.5.</b>
                        
                        尝试改写新浪网分类资讯爬虫1
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.6" data-path="file/part07/7.6.html">
            
                
                    <a href="../../file/part07/7.6.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.6.</b>
                        
                        尝试改写新浪网分类资讯爬虫2
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.7" data-path="file/part07/7.7.html">
            
                
                    <a href="../../file/part07/7.7.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.7.</b>
                        
                        IT桔子分布式项目1
                    </a>
            
            
        </li>
    
        <li class="chapter " data-level="7.8" data-path="file/part07/7.8.html">
            
                
                    <a href="../../file/part07/7.8.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>7.8.</b>
                        
                        IT桔子分布式项目2
                    </a>
            
            
        </li>
    

            </ul>
            
        </li>
    
        <li class="chapter " data-level="8" data-path="file/duanzi/duanzi.html">
            
                
                    <a href="../../file/duanzi/duanzi.html">
                
                        <i class="fa fa-check"></i>
                        
                            <b>8.</b>
                        
                        课余段子
                    </a>
            
            
        </li>
    


            
            <li class="divider"></li>
            <li>
                <a href="https://www.gitbook.com" target="_blank" rel="noopener noreferrer" class="gitbook-link">
                    Published with GitBook
                </a>
            </li>
            
        </ul>
    </nav>
</div>

    <div class="book-body">
        <div class="body-inner">
            <div class="book-header" role="navigation">
    <!-- Actions Left -->
    

    <!-- Title -->
    <h1>
        <i class="fa fa-circle-o-notch fa-spin"></i>
        <a href="../../" >Python爬虫课程讲义</a>
    </h1>
</div>

            <div class="page-wrapper" tabindex="-1" role="main">
                <div class="page-inner">
                
                
                    <section class="normal" id="section-">
                    
                        <h2 id="&#x9633;&#x5149;&#x70ED;&#x7EBF;&#x95EE;&#x653F;&#x5E73;&#x53F0;">&#x9633;&#x5149;&#x70ED;&#x7EBF;&#x95EE;&#x653F;&#x5E73;&#x53F0;</h2>
<p><a href="http://wz.sun0769.com/index.php/question/questionType?type=4" target="_blank">http://wz.sun0769.com/index.php/question/questionType?type=4</a></p>
<p>&#x722C;&#x53D6;&#x6295;&#x8BC9;&#x5E16;&#x5B50;&#x7684;&#x7F16;&#x53F7;&#x3001;&#x5E16;&#x5B50;&#x7684;url&#x3001;&#x5E16;&#x5B50;&#x7684;&#x6807;&#x9898;&#xFF0C;&#x548C;&#x5E16;&#x5B50;&#x91CC;&#x7684;&#x5185;&#x5BB9;&#x3002;</p>
<h3 id="itemspy">items.py</h3>
<pre><code class="lang-python"><span class="hljs-keyword">import</span> scrapy

<span class="hljs-class"><span class="hljs-keyword">class</span> <span class="hljs-title">DongguanItem</span><span class="hljs-params">(scrapy.Item)</span>:</span>
    <span class="hljs-comment"># &#x6BCF;&#x4E2A;&#x5E16;&#x5B50;&#x7684;&#x6807;&#x9898;</span>
    title = scrapy.Field()
    <span class="hljs-comment"># &#x6BCF;&#x4E2A;&#x5E16;&#x5B50;&#x7684;&#x7F16;&#x53F7;</span>
    number = scrapy.Field()
    <span class="hljs-comment"># &#x6BCF;&#x4E2A;&#x5E16;&#x5B50;&#x7684;&#x6587;&#x5B57;&#x5185;&#x5BB9;</span>
    content = scrapy.Field()
    <span class="hljs-comment"># &#x6BCF;&#x4E2A;&#x5E16;&#x5B50;&#x7684;url</span>
    url = scrapy.Field()
</code></pre>
<h3 id="spiderssunwzpy">spiders/sunwz.py</h3>
<h6 id="spider-&#x7248;&#x672C;">Spider &#x7248;&#x672C;</h6>
<pre><code class="lang-python"><span class="hljs-comment"># -*- coding: utf-8 -*-</span>

<span class="hljs-keyword">import</span> scrapy
<span class="hljs-keyword">from</span> dongguan.items <span class="hljs-keyword">import</span> DongguanItem

<span class="hljs-class"><span class="hljs-keyword">class</span> <span class="hljs-title">SunSpider</span><span class="hljs-params">(scrapy.Spider)</span>:</span>
    name = <span class="hljs-string">&apos;sun&apos;</span>
    allowed_domains = [<span class="hljs-string">&apos;wz.sun0769.com&apos;</span>]
    url = <span class="hljs-string">&apos;http://wz.sun0769.com/index.php/question/questionType?type=4&amp;page=&apos;</span>
    offset = <span class="hljs-number">0</span>
    start_urls = [url + str(offset)]

    <span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">parse</span><span class="hljs-params">(self, response)</span>:</span>
        <span class="hljs-comment"># &#x53D6;&#x51FA;&#x6BCF;&#x4E2A;&#x9875;&#x9762;&#x91CC;&#x5E16;&#x5B50;&#x94FE;&#x63A5;&#x5217;&#x8868;</span>
        links = response.xpath(<span class="hljs-string">&quot;//div[@class=&apos;greyframe&apos;]/table//td/a[@class=&apos;news14&apos;]/@href&quot;</span>).extract()
        <span class="hljs-comment"># &#x8FED;&#x4EE3;&#x53D1;&#x9001;&#x6BCF;&#x4E2A;&#x5E16;&#x5B50;&#x7684;&#x8BF7;&#x6C42;&#xFF0C;&#x8C03;&#x7528;parse_item&#x65B9;&#x6CD5;&#x5904;&#x7406;</span>
        <span class="hljs-keyword">for</span> link <span class="hljs-keyword">in</span> links:
            <span class="hljs-keyword">yield</span> scrapy.Request(link, callback = self.parse_item)
        <span class="hljs-comment"># &#x8BBE;&#x7F6E;&#x9875;&#x7801;&#x7EC8;&#x6B62;&#x6761;&#x4EF6;&#xFF0C;&#x5E76;&#x4E14;&#x6BCF;&#x6B21;&#x53D1;&#x9001;&#x65B0;&#x7684;&#x9875;&#x9762;&#x8BF7;&#x6C42;&#x8C03;&#x7528;parse&#x65B9;&#x6CD5;&#x5904;&#x7406;</span>
        <span class="hljs-keyword">if</span> self.offset &lt;= <span class="hljs-number">71130</span>:
            self.offset += <span class="hljs-number">30</span>
            <span class="hljs-keyword">yield</span> scrapy.Request(self.url + str(self.offset), callback = self.parse)

    <span class="hljs-comment"># &#x5904;&#x7406;&#x6BCF;&#x4E2A;&#x5E16;&#x5B50;&#x91CC;</span>
    <span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">parse_item</span><span class="hljs-params">(self, response)</span>:</span>
        item = DongguanItem()
        <span class="hljs-comment"># &#x6807;&#x9898;</span>
        item[<span class="hljs-string">&apos;title&apos;</span>] = response.xpath(<span class="hljs-string">&apos;//div[contains(@class, &quot;pagecenter p3&quot;)]//strong/text()&apos;</span>).extract()[<span class="hljs-number">0</span>]

        <span class="hljs-comment"># &#x7F16;&#x53F7;</span>
        item[<span class="hljs-string">&apos;number&apos;</span>] = item[<span class="hljs-string">&apos;title&apos;</span>].split(<span class="hljs-string">&apos; &apos;</span>)[-<span class="hljs-number">1</span>].split(<span class="hljs-string">&quot;:&quot;</span>)[-<span class="hljs-number">1</span>]

        <span class="hljs-comment"># &#x6587;&#x5B57;&#x5185;&#x5BB9;&#xFF0C;&#x9ED8;&#x8BA4;&#x5148;&#x53D6;&#x51FA;&#x6709;&#x56FE;&#x7247;&#x60C5;&#x51B5;&#x4E0B;&#x7684;&#x6587;&#x5B57;&#x5185;&#x5BB9;&#x5217;&#x8868;</span>
        content = response.xpath(<span class="hljs-string">&apos;//div[@class=&quot;contentext&quot;]/text()&apos;</span>).extract()
        <span class="hljs-comment"># &#x5982;&#x679C;&#x6CA1;&#x6709;&#x5185;&#x5BB9;&#xFF0C;&#x5219;&#x53D6;&#x51FA;&#x6CA1;&#x6709;&#x56FE;&#x7247;&#x60C5;&#x51B5;&#x4E0B;&#x7684;&#x6587;&#x5B57;&#x5185;&#x5BB9;&#x5217;&#x8868;</span>
        <span class="hljs-keyword">if</span> len(content) == <span class="hljs-number">0</span>:
            content = response.xpath(<span class="hljs-string">&apos;//div[@class=&quot;c1 text14_2&quot;]/text()&apos;</span>).extract()
            <span class="hljs-comment"># content&#x4E3A;&#x5217;&#x8868;&#xFF0C;&#x901A;&#x8FC7;join&#x65B9;&#x6CD5;&#x62FC;&#x63A5;&#x4E3A;&#x5B57;&#x7B26;&#x4E32;&#xFF0C;&#x5E76;&#x53BB;&#x9664;&#x9996;&#x5C3E;&#x7A7A;&#x683C;</span>
            item[<span class="hljs-string">&apos;content&apos;</span>] = <span class="hljs-string">&quot;&quot;</span>.join(content).strip()
        <span class="hljs-keyword">else</span>:
            item[<span class="hljs-string">&apos;content&apos;</span>] = <span class="hljs-string">&quot;&quot;</span>.join(content).strip()

        <span class="hljs-comment"># &#x94FE;&#x63A5;</span>
        item[<span class="hljs-string">&apos;url&apos;</span>] = response.url

        <span class="hljs-keyword">yield</span> item
</code></pre>
<h6 id="crawlspider-&#x7248;&#x672C;">CrawlSpider &#x7248;&#x672C;</h6>
<pre><code class="lang-python">
<span class="hljs-comment"># -*- coding: utf-8 -*-</span>
<span class="hljs-keyword">import</span> scrapy
<span class="hljs-keyword">from</span> scrapy.linkextractors <span class="hljs-keyword">import</span> LinkExtractor
<span class="hljs-keyword">from</span> scrapy.spiders <span class="hljs-keyword">import</span> CrawlSpider, Rule
<span class="hljs-keyword">from</span> dongguan.items <span class="hljs-keyword">import</span> DongguanItem
<span class="hljs-keyword">import</span> time


<span class="hljs-class"><span class="hljs-keyword">class</span> <span class="hljs-title">SunSpider</span><span class="hljs-params">(CrawlSpider)</span>:</span>
    name = <span class="hljs-string">&apos;sun&apos;</span>
    allowed_domains = [<span class="hljs-string">&apos;wz.sun0769.com&apos;</span>]
    start_urls = [<span class="hljs-string">&apos;http://wz.sun0769.com/index.php/question/questionType?type=4&amp;page=&apos;</span>]

    <span class="hljs-comment"># &#x6BCF;&#x4E00;&#x9875;&#x7684;&#x5339;&#x914D;&#x89C4;&#x5219;</span>
    pagelink = LinkExtractor(allow=(<span class="hljs-string">&apos;type=4&apos;</span>))
    <span class="hljs-comment"># &#x6BCF;&#x4E2A;&#x5E16;&#x5B50;&#x7684;&#x5339;&#x914D;&#x89C4;&#x5219;</span>
    contentlink = LinkExtractor(allow=<span class="hljs-string">r&apos;/html/question/\d+/\d+\.shtml&apos;</span>)

    rules = [
        <span class="hljs-comment"># &#x672C;&#x6848;&#x4F8B;&#x4E3A;&#x7279;&#x6B8A;&#x60C5;&#x51B5;&#xFF0C;&#x9700;&#x8981;&#x8C03;&#x7528;deal_links&#x65B9;&#x6CD5;&#x5904;&#x7406;&#x6BCF;&#x4E2A;&#x9875;&#x9762;&#x91CC;&#x7684;&#x94FE;&#x63A5;</span>
        Rule(pagelink, process_links = <span class="hljs-string">&quot;deal_links&quot;</span>, follow = <span class="hljs-keyword">True</span>),
        Rule(contentlink, callback = <span class="hljs-string">&apos;parse_item&apos;</span>)
    ]

    <span class="hljs-comment"># &#x9700;&#x8981;&#x91CD;&#x65B0;&#x5904;&#x7406;&#x6BCF;&#x4E2A;&#x9875;&#x9762;&#x91CC;&#x7684;&#x94FE;&#x63A5;&#xFF0C;&#x5C06;&#x94FE;&#x63A5;&#x91CC;&#x7684;&#x2018;Type&amp;type=4?page=xxx&#x2019;&#x66FF;&#x6362;&#x4E3A;&#x2018;Type?type=4&amp;page=xxx&#x2019;&#xFF08;&#x6216;&#x8005;&#x662F;Type&amp;page=xxx?type=4&#x2019;&#x66FF;&#x6362;&#x4E3A;&#x2018;Type?page=xxx&amp;type=4&#x2019;&#xFF09;&#xFF0C;&#x5426;&#x5219;&#x65E0;&#x6CD5;&#x53D1;&#x9001;&#x8FD9;&#x4E2A;&#x94FE;&#x63A5;</span>
    <span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">deal_links</span><span class="hljs-params">(self, links)</span>:</span>
        <span class="hljs-keyword">for</span> link <span class="hljs-keyword">in</span> links:
            link.url = link.url.replace(<span class="hljs-string">&quot;?&quot;</span>,<span class="hljs-string">&quot;&amp;&quot;</span>).replace(<span class="hljs-string">&quot;Type&amp;&quot;</span>, <span class="hljs-string">&quot;Type?&quot;</span>)
            <span class="hljs-keyword">print</span> link.url
        <span class="hljs-keyword">return</span> links


    <span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">parse_item</span><span class="hljs-params">(self, response)</span>:</span>
        <span class="hljs-keyword">print</span> response.url
        item = DongguanItem()
        <span class="hljs-comment"># &#x6807;&#x9898;</span>
        item[<span class="hljs-string">&apos;title&apos;</span>] = response.xpath(<span class="hljs-string">&apos;//div[contains(@class, &quot;pagecenter p3&quot;)]//strong/text()&apos;</span>).extract()[<span class="hljs-number">0</span>]

        <span class="hljs-comment"># &#x7F16;&#x53F7;</span>
        item[<span class="hljs-string">&apos;number&apos;</span>] = item[<span class="hljs-string">&apos;title&apos;</span>].split(<span class="hljs-string">&apos; &apos;</span>)[-<span class="hljs-number">1</span>].split(<span class="hljs-string">&quot;:&quot;</span>)[-<span class="hljs-number">1</span>]

        <span class="hljs-comment"># &#x6587;&#x5B57;&#x5185;&#x5BB9;&#xFF0C;&#x9ED8;&#x8BA4;&#x5148;&#x53D6;&#x51FA;&#x6709;&#x56FE;&#x7247;&#x60C5;&#x51B5;&#x4E0B;&#x7684;&#x6587;&#x5B57;&#x5185;&#x5BB9;&#x5217;&#x8868;</span>
        content = response.xpath(<span class="hljs-string">&apos;//div[@class=&quot;contentext&quot;]/text()&apos;</span>).extract()
        <span class="hljs-comment"># &#x5982;&#x679C;&#x6CA1;&#x6709;&#x5185;&#x5BB9;&#xFF0C;&#x5219;&#x53D6;&#x51FA;&#x6CA1;&#x6709;&#x56FE;&#x7247;&#x60C5;&#x51B5;&#x4E0B;&#x7684;&#x6587;&#x5B57;&#x5185;&#x5BB9;&#x5217;&#x8868;</span>
        <span class="hljs-keyword">if</span> len(content) == <span class="hljs-number">0</span>:
            content = response.xpath(<span class="hljs-string">&apos;//div[@class=&quot;c1 text14_2&quot;]/text()&apos;</span>).extract()
            <span class="hljs-comment"># content&#x4E3A;&#x5217;&#x8868;&#xFF0C;&#x901A;&#x8FC7;join&#x65B9;&#x6CD5;&#x62FC;&#x63A5;&#x4E3A;&#x5B57;&#x7B26;&#x4E32;&#xFF0C;&#x5E76;&#x53BB;&#x9664;&#x9996;&#x5C3E;&#x7A7A;&#x683C;</span>
            item[<span class="hljs-string">&apos;content&apos;</span>] = <span class="hljs-string">&quot;&quot;</span>.join(content).strip()
        <span class="hljs-keyword">else</span>:
            item[<span class="hljs-string">&apos;content&apos;</span>] = <span class="hljs-string">&quot;&quot;</span>.join(content).strip()

        <span class="hljs-comment"># &#x94FE;&#x63A5;</span>
        item[<span class="hljs-string">&apos;url&apos;</span>] = response.url

        <span class="hljs-keyword">yield</span> item
</code></pre>
<h4 id="pipelinespy">pipelines.py</h4>
<pre><code class="lang-python"><span class="hljs-comment"># -*- coding: utf-8 -*-</span>

<span class="hljs-comment"># &#x6587;&#x4EF6;&#x5904;&#x7406;&#x7C7B;&#x5E93;&#xFF0C;&#x53EF;&#x4EE5;&#x6307;&#x5B9A;&#x7F16;&#x7801;&#x683C;&#x5F0F;</span>
<span class="hljs-keyword">import</span> codecs
<span class="hljs-keyword">import</span> json

<span class="hljs-class"><span class="hljs-keyword">class</span> <span class="hljs-title">JsonWriterPipeline</span><span class="hljs-params">(object)</span>:</span>

    <span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">__init__</span><span class="hljs-params">(self)</span>:</span>
        <span class="hljs-comment"># &#x521B;&#x5EFA;&#x4E00;&#x4E2A;&#x53EA;&#x5199;&#x6587;&#x4EF6;&#xFF0C;&#x6307;&#x5B9A;&#x6587;&#x672C;&#x7F16;&#x7801;&#x683C;&#x5F0F;&#x4E3A;utf-8</span>
        self.filename = codecs.open(<span class="hljs-string">&apos;sunwz.json&apos;</span>, <span class="hljs-string">&apos;w&apos;</span>, encoding=<span class="hljs-string">&apos;utf-8&apos;</span>)

    <span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">process_item</span><span class="hljs-params">(self, item, spider)</span>:</span>
        content = json.dumps(dict(item), ensure_ascii=<span class="hljs-keyword">False</span>) + <span class="hljs-string">&quot;\n&quot;</span>
        self.filename.write(content)
        <span class="hljs-keyword">return</span> item

    <span class="hljs-function"><span class="hljs-keyword">def</span> <span class="hljs-title">spider_closed</span><span class="hljs-params">(self, spider)</span>:</span>
        self.filename.close()
</code></pre>
<h4 id="settingspy">settings.py</h4>
<pre><code class="lang-python">ITEM_PIPELINES = {
    <span class="hljs-string">&apos;dongguan.pipelines.JsonWriterPipeline&apos;</span>: <span class="hljs-number">300</span>,
}

<span class="hljs-comment"># &#x65E5;&#x5FD7;&#x6587;&#x4EF6;&#x540D;&#x548C;&#x5904;&#x7406;&#x7B49;&#x7EA7;</span>
LOG_FILE = <span class="hljs-string">&quot;dg.log&quot;</span>
LOG_LEVEL = <span class="hljs-string">&quot;DEBUG&quot;</span>
</code></pre>
<h4 id="&#x5728;&#x9879;&#x76EE;&#x6839;&#x76EE;&#x5F55;&#x4E0B;&#x65B0;&#x5EFA;mainpy&#x6587;&#x4EF6;&#x7528;&#x4E8E;&#x8C03;&#x8BD5;">&#x5728;&#x9879;&#x76EE;&#x6839;&#x76EE;&#x5F55;&#x4E0B;&#x65B0;&#x5EFA;main.py&#x6587;&#x4EF6;,&#x7528;&#x4E8E;&#x8C03;&#x8BD5;</h4>
<pre><code class="lang-python"><span class="hljs-keyword">from</span> scrapy <span class="hljs-keyword">import</span> cmdline
cmdline.execute(<span class="hljs-string">&apos;scrapy crawl sun&apos;</span>.split())
</code></pre>
<h4 id="&#x6267;&#x884C;&#x7A0B;&#x5E8F;">&#x6267;&#x884C;&#x7A0B;&#x5E8F;</h4>
<pre><code>py2 main.py
</code></pre><footer class="page-footer"><span class="copyright">Copyright &#xA9; BigCat all right reserved&#xFF0C;powered by Gitbook</span><span class="footer-modification">&#x300C;Revision Time:
2017-02-04 00:50:22&#x300D;
</span></footer>
                    
                    </section>
                
                
                </div>
            </div>
        </div>

        
        <a href="../../file/part05/5.1.html" class="navigation navigation-prev " aria-label="Previous page: (案例一)手机App抓包爬虫"><i class="fa fa-angle-left"></i></a>
        
        
        <a href="../../file/part05/5.3.html" class="navigation navigation-next " aria-label="Next page: (案例三)新浪网分类资讯爬虫"><i class="fa fa-angle-right"></i></a>
        
    </div>
</div>

        
<script src="../../gitbook/app.js"></script>

    
    <script src="../../gitbook/plugins/gitbook-plugin-splitter/splitter.js"></script>
    

    
    <script src="../../gitbook/plugins/gitbook-plugin-toggle-chapters/toggle.js"></script>
    

    
    <script src="../../gitbook/plugins/gitbook-plugin-fontsettings/buttons.js"></script>
    

    
    <script src="../../gitbook/plugins/gitbook-plugin-livereload/plugin.js"></script>
    

<script>
// Bootstrap the GitBook frontend: load the "gitbook" AMD module and start it
// with the build-time plugin configuration (sharing, footer, fonts, etc.).
require(["gitbook"], function(gitbook) {
    var config = {"disqus":{"shortName":"gitbookuse"},"github":{"url":"https://github.com/dododream"},"search-pro":{"cutWordLib":"nodejieba","defineWord":["gitbook-use"]},"sharing":{"weibo":true,"facebook":true,"twitter":true,"google":false,"instapaper":false,"vk":false,"all":["facebook","google","twitter","weibo","instapaper"]},"tbfed-pagefooter":{"copyright":"Copyright © BigCat","modify_label":"「Revision Time:","modify_format":"YYYY-MM-DD HH:mm:ss」"},"baidu":{"token":"ff100361cdce95dd4c8fb96b4009f7bc"},"sitemap":{"hostname":"http://www.treenewbee.top"},"donate":{"wechat":"http://weixin.png","alipay":"http://alipay.png","title":"","button":"赏","alipayText":"支付宝打赏","wechatText":"微信打赏"},"edit-link":{"base":"https://github.com/dododream/edit","label":"Edit This Page"},"splitter":{},"toggle-chapters":{},"highlight":{},"fontsettings":{"theme":"white","family":"sans","size":2},"livereload":{}};
    gitbook.start(config);
});
</script>

        <!-- body:end -->
    </body>
    <!-- End of book Python爬虫课程讲义 -->
</html>
